/**********************************************************************
gc_2_clean_individual_datasets.do

**********************************************************************/
**********
* SET UP *
**********
clear all
set matsize 2000
set more 1

* location for dofiles *
cd "T:\_Projet_4915\dofiles"
global dir "`c(pwd)'"
cd $dir

*********
* GATES *
*********
* Specify which data you want to work with (synthetic = syn, real = rl) *
local ext = "rl"

* Specify which cross-sections you want to process *
local start_year = 2001
local end_year = 2012

* STEP 1: clean cities data *
local gate1 = 1

* STEP 2: clean bid to eid mapping *
local gate2 = 1

* STEP 3: clean business owners data *
local gate3 = 1

* STEP 4: clean individuals data *
local gate4 = 1

* STEP 5: merge business owners to individuals data and clean cross-section *
local gate5 = 1

* STEP 6: merge cities and clean cross-section *
local gate6 = 1

* STEP 7: delete all intermediate datasets *
local gate7 = 1

* start log file *
quietly capture log close
quietly log using gc_2_clean_individual_datasets_`ext', text replace

* specify file locations *
global project_folder "\_Projet_4915"
global data_folder "\_Projet_4915\DATA"
global output_folder "\_Projet_4915\ResultsFolder"
global temp "temp"

local datadir T:\${data_folder}\

*******************************************
* MAKE A GLOBAL VARIABLE FOR TODAY'S DATE *
*******************************************
local tyr = substr("$S_DATE",8,4)
local tmo = substr("$S_DATE",4,3)
local tmd = trim(substr("$S_DATE",1,2))

* make day of month two digits *
local wl = length("`tmd'")
if `wl'==1 {
local tmd2 ="0"+"`tmd'"
}
if `wl'!=1 {
local tmd2 ="`tmd'"
}

* get numeric month, make it two digits *
local di="`tmd2'"+"`tmo'"+"`tyr'"
local edate = date("`di'", "DMY")
local mono = month(`edate')
local ml = length("`mono'")
if `ml'==1 {
local mono2 ="0"+"`mono'"
}
if `ml'!=1 {
local mono2 ="`mono'"
}

* put final date together *
global date = "`tyr'"+"`mono2'"+"`tmd2'"

****************************************
* DEFINE LITTLE PROGRAMS TO PRINT TIME *
****************************************
program define starttime
	display "Started processing at $S_TIME on $S_DATE"
end

program define endtime
	display "Finished processing at $S_TIME on $S_DATE"
end

************************
* START OF THE PROGRAM *
************************

*****************************
* STEP 1: clean cities data *
*****************************
disp "***** Started processing STEP 1 *****"
starttime
if 1 == `gate1' ///
{
	disp "***** STEP 1: clean cities data *****"
	
	use "`datadir'PC_masterfile.dta", clear
	drop city_size
	drop if d_t1_pc == "NtFound"
	preserve
	keep cma_name cma_code pop*
	duplicates drop
	drop if missing(cma_code)
	sort pop_cma_2006
	
	gen city_size_149c = _N - _n + 1
	label var city_size_149c "City size (149 categories, based on 2006 city pop)"
	
	gen city_size_3c = 0
	replace city_size_3c = 1 if pop_cma_2006 >= 2000000
	replace city_size_3c = 2 if pop_cma_2006 < 2000000 & pop_cma_2006 >= 350000
	replace city_size_3c = 3 if pop_cma_2006 < 350000
	label var city_size_3c "City size (3 categories, based on 2006 city pop)"
	label define city_size_3c_lbl 1 "Toronto, Montreal, Vancouver" 2 "Ottawa to Halifax" 3 "Less than 350K"
	label values city_size_3c city_size_3c_lbl
	
	gen city_size_4c = 0
	replace city_size_4c = 1 if pop_cma_2006 >= 2000000
	replace city_size_4c = 2 if pop_cma_2006 < 2000000 & pop_cma_2006 >= 500000
	replace city_size_4c = 3 if pop_cma_2006 < 500000 & pop_cma_2006 >= 100000
	replace city_size_4c = 4 if pop_cma_2006 < 100000
	label var city_size_4c "City size (4 categories, based on 2006 city pop)"
	label define city_size_4c_lbl 1 "Toronto, Montreal, Vancouver" 2 "Ott., Cal., Edm., Qc., Win., Ham." 3 "From London to Cape-Breton" 4 "Less than 100K"
	label values city_size_4c city_size_4c_lbl
	
	gen city_size_5c = 0
	replace city_size_5c = 1 if pop_cma_2006 >= 5000000
	replace city_size_5c = 2 if pop_cma_2006 < 5000000 & pop_cma_2006 >= 3000000
	replace city_size_5c = 3 if pop_cma_2006 < 3000000 & pop_cma_2006 >= 2000000
	replace city_size_5c = 4 if pop_cma_2006 < 2000000 & pop_cma_2006 >= 350000
	replace city_size_5c = 5 if pop_cma_2006 < 350000
	label var city_size_5c "City size (5 categories, based on 2006 city pop)"
	label define city_size_5c_lbl 1 "Toronto" 2 "Montreal" 3 "Vancouver" 4 "Ottawa to Halifax" 5 "Less than 350K"
	label values city_size_5c city_size_5c_lbl
	
	gen city_size_6c = 0
	replace city_size_6c = 1 if pop_cma_2006 >= 5000000
	replace city_size_6c = 2 if pop_cma_2006 < 5000000 & pop_cma_2006 >= 3000000
	replace city_size_6c = 3 if pop_cma_2006 < 3000000 & pop_cma_2006 >= 2000000
	replace city_size_6c = 4 if pop_cma_2006 < 2000000 & pop_cma_2006 >= 500000
	replace city_size_6c = 5 if pop_cma_2006 < 500000 & pop_cma_2006 >= 100000
	replace city_size_6c = 6 if pop_cma_2006 < 100000
	label var city_size_6c "City size (6 categories, based on 2006 city pop)"
	label define city_size_6c_lbl 1 "Toronto" 2 "Montreal" 3 "Vancouver" 4 "Ott., Cal., Edm., Qc., Win., Ham." 5 "From London to Cape-Breton" 6 "Less than 100K"
	label values city_size_6c city_size_6c_lbl

	keep cma_code pop_cma_2001 pop_cma_2006 pop_cma_2011 city*
	order cma_code pop_cma_2001 pop_cma_2006 pop_cma_2011 city*
	
	* save intermediate dataset to the data_folder *
	compress
	save "`datadir'gc_cma_census_`ext'.dta", replace
	restore
	keep d_t1_pc prov cma_name cma_code
	merge 1:1 d_t1_pc using "`datadir'geogcor.dta"

	rename d_t1_pc pc
	replace prov = "NL" if pruid == 10 & _merge == 2
	replace prov = "PE" if pruid == 11 & _merge == 2
	replace prov = "NS" if pruid == 12 & _merge == 2
	replace prov = "NB" if pruid == 13 & _merge == 2
	replace prov = "QC" if pruid == 24 & _merge == 2
	replace prov = "ON" if pruid == 35 & _merge == 2
	replace prov = "MB" if pruid == 46 & _merge == 2
	replace prov = "SK" if pruid == 47 & _merge == 2
	replace prov = "AB" if pruid == 48 & _merge == 2
	replace prov = "BC" if pruid == 59 & _merge == 2
	replace prov = "YT" if pruid == 60 & _merge == 2
	replace prov = "NT" if pruid == 61 & _merge == 2
	replace prov = "NU" if pruid == 62 & _merge == 2
	replace cma_code = cmapuid if _merge == 2
	replace cma_code = cmapuid if cma_code == "" & cmapuid != ""
	replace cma_name = cmaname if _merge == 2
	replace cma_name = cmaname if cma_name == "" & cmaname != ""
	
	keep pc prov cma_name cma_code
	
	replace cma_code = "35505" if cma_code == "24505"
	replace cma_code = "48840" if cma_code == "47840"
	
	merge m:1 cma_code using "`datadir'gc_cma_census_`ext'.dta"
	drop _merge	
	* save intermediate dataset to the data_folder *
	compress
	save "`datadir'gc_pc_to_cma_`ext'.dta", replace

	clear
	disp "***** STEP 1: clean cities data (COMPLETED)*****"
}
disp "***** Finished processing STEP 1 *****"
endtime

************************************
* STEP 2: clean bid to eid mapping *
************************************
disp "***** Started processing STEP 2 *****"
starttime
if 1 == `gate2' ///
{
	disp "***** STEP 2: clean bid to eid mapping *****"
	
	forvalues y = `start_year'(1)`end_year' ///
	{
		/* bid and tax_yr should uniquely identify observations. However, there is one instance
		in 2008 where a single bid is associated with more than one eid in the same year. I drop both bids in this case. */
		* keep only one eid per bid *
		use "`datadir'Entid_bn_`y'.dta", clear
		sort bid
		by bid: gen aux = _n
		by bid: egen maux = max(aux)
		tab maux
		drop if maux > 1
		drop aux maux
		* save intermediate dataset to the data_folder *
		compress
		save "`datadir'gc_Entid_bn_`y'_clean.dta", replace
		clear
	}
	disp "***** STEP 2: clean bid to eid mapping (COMPLETED) *****"
}
disp "***** Finished processing STEP 2 *****"
endtime

**************************************
* STEP 3: clean business owners data *
**************************************
disp "***** Started processing STEP 3 *****"
starttime
if 1 == `gate3' ///
{
	disp "***** STEP 3: clean business owners data *****"
	
	forvalues y = `start_year'(1)`end_year' ///
	{
		use "`datadir'SE_all_`y'_`ext'.dta", clear
		
		* create bid *
		gen bid = bid2
		replace bid = bid1 if bid == ""
		
		****************************
		*drop dead business owners *
		****************************
		drop if t1h_death_cd == "D"
		drop if pid == ""
		
		sort pid bid
		
		/*
		* limit the size of the dataset to speed up debugging *
		gen limit_sample = _n
		keep if limit_sample <= 10000
		*/
		
		* fill in tax year *
		gen tax_yr = `y'
		
		* merge eid information *
		merge m:1 bid using "`datadir'gc_Entid_bn_`y'_clean.dta"
		drop if _merge == 2
		drop _merge
		
		*******************************************
		* calculate some totals by business owner *
		*******************************************
		sort pid bid
		replace t1h_bus_inc_gross = 0 if missing(t1h_bus_inc_gross)
		replace t1h_bus_inc_net = 0 if missing(t1h_bus_inc_net)
		replace t4earn_bn = 0 if missing(t4earn_bn)
		replace bus_earn_bn = 0 if missing(bus_earn_bn)
		bys pid: egen tot_busi_t4 = total(t4earn_bn)
		bys pid: egen tot_busi_3680 = total(bus_earn_bn)

		***********************************************************
		* calculate some totals by business owner across startups *
		***********************************************************
		if `y' != 2001 ///
		{
			* merge startup information *
			merge m:1 eid tax_yr using "`datadir'gc_startup_panel_`ext'.dta"
			drop if _merge == 2
			gen t4earn_bn_startups = t4earn_bn
			replace t4earn_bn_startups = 0 if _merge == 1
			gen bus_earn_bn_startups = bus_earn_bn
			replace bus_earn_bn_startups = 0 if _merge == 1
			drop _merge
			sort pid bid
			bys pid: egen tot_busi_t4_startups = total(t4earn_bn_startups)
			bys pid: egen tot_busi_3680_startups = total(bus_earn_bn_startups)
		}
		if `y' == 2001 ///
		{
			gen tot_busi_t4_startups = 0
			gen tot_busi_3680_startups = 0
		}
		
		**************************************************************************
		* keep only one observation per pid and keep track of relevant firm eids *
		**************************************************************************
		if `y' != 2001 ///
		{
			* if there are multiple firms associated with a single business owner, keep track of eid associated with most employment income among startups (using t4earn_bn_startups) *
			replace t4earn_bn_startups = . if t4earn_bn_startups == 0
			bys pid: egen max_t4earn = max(t4earn_bn_startups)
			gen flag1 = 0
			replace flag1 = 1 if round(t4earn_bn_startups,.01) == round(max_t4earn,.01) & t4earn_bn_startups != .
			destring naics4_t2, gen(naics4_int_aux)
			sort pid flag1 naics4_int_aux
			bys pid flag1: gen counter = _n
			gen eid_startup = ""
			replace eid_startup = eid if flag1 == 1 & counter == 1
			drop if flag1 == 1 & counter != 1
			drop max_t4earn counter
			count if flag1 == 1
			
			replace t4earn_bn = . if t4earn_bn == 0
			bys pid: egen max_t4earn = max(t4earn_bn)
			gen flag2 = 0
			replace flag2 = 1 if round(t4earn_bn,.01) == round(max_t4earn,.01) & t4earn_bn != .
			sort pid flag2 naics4_int_aux
			bys pid flag2: gen counter = _n
			gen eid_all = ""
			replace eid_all = eid if flag2 == 1 & counter == 1
			drop if flag2 == 1 & counter != 1 & flag1 == 0
			drop max_t4earn counter
			count if flag2 == 1
			
			gen final_sort = 99
			replace final_sort = 2 if flag2 == 1
			replace final_sort = 1 if flag1 == 1
			sort pid final_sort
			by pid: gen counter2 = _n
			keep if counter2 == 1
			drop flag1 flag2 final_sort counter2 eid
		}
		
		if `y' == 2001 ///
		{
			gen flag1 = 0
			destring naics4_t2, gen(naics4_int_aux)
			sort pid flag1 naics4_int_aux
			bys pid flag1: gen counter = _n
			gen eid_startup = ""
			replace eid_startup = eid if flag1 == 1 & counter == 1
			drop if flag1 == 1 & counter != 1
			drop counter
			count if flag1 == 1
			
			replace t4earn_bn = . if t4earn_bn == 0
			bys pid: egen max_t4earn = max(t4earn_bn)
			gen flag2 = 0
			replace flag2 = 1 if round(t4earn_bn,.01) == round(max_t4earn,.01) & t4earn_bn != .
			sort pid flag2 naics4_int_aux
			bys pid flag2: gen counter = _n
			gen eid_all = ""
			replace eid_all = eid if flag2 == 1 & counter == 1
			drop if flag2 == 1 & counter != 1 & flag1 == 0
			drop max_t4earn counter
			count if flag2 == 1
			
			gen final_sort = 99
			replace final_sort = 2 if flag2 == 1
			replace final_sort = 1 if flag1 == 1
			sort pid final_sort
			by pid: gen counter2 = _n
			keep if counter2 == 1
			drop flag1 flag2 final_sort counter2 eid
		}
		
		* keep key variables *
		keep pid tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb d_t1_pc ///
		eid_all eid_startup ///
		t1h_bus_inc_gross t1h_bus_inc_net tot_busi_t4 tot_busi_3680 tot_busi_t4_startups tot_busi_3680_startups
		
		* order key variables *
		order pid tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb d_t1_pc ///
		eid_all eid_startup ///
		t1h_bus_inc_gross t1h_bus_inc_net tot_busi_t4 tot_busi_3680 tot_busi_t4_startups tot_busi_3680_startups

		* save intermediate dataset to the data_folder *
		compress
		save "`datadir'gc_SE_all_`y'_`ext'_clean.dta", replace
		clear

		* delete intermediate datasets as I go *
		capture erase "`datadir'gc_Entid_bn_`y'_clean.dta"
	}
	
	disp "***** STEP 3: clean business owners data (COMPLETED) *****"
}
disp "***** Finished processing STEP 3 *****"
endtime

**********************************
* STEP 4: clean individuals data *
**********************************
disp "***** Started processing STEP 4 *****"
starttime
if 1 == `gate4' ///
{
	disp "***** STEP 4: clean individuals data *****"
	
	forvalues y = `start_year'(1)`end_year' ///
	{
		use "`datadir'Individuals_`y'_`ext'.dta", clear
		
		sort pid eid
		drop if pid == ""
		
		/*
		* limit the size of the dataset for debugging *
		gen limit_sample = _n
		keep if limit_sample <= 5000
		*/

		*************************************
		* calculate some totals by employee *
		*************************************
		sort pid eid
		foreach var in ///
		"emp_inc" ///
		{
			bys pid: egen tot_`var' = total(`var')
		}
		
		*******************************
		* keep only the main employer *
		*******************************
		* if there are multiple employers associated with a single worker, keep only the one with the most employment income (using emp_inc) *
		replace emp_inc = . if emp_inc == 0
		bys pid: egen max_emp_inc = max(emp_inc)
		keep if round(emp_inc,.01) == round(max_emp_inc,.01)
		drop max_emp_inc

		* if there is a tie, keep employers with non-missing eid
		sort pid eid
		bys pid: gen count = _n
		bys pid: gen max_count = _N
		drop if count < max_count & eid == ""
		drop count max_count
		
		* if there is still a tie, keep the first one *
		bys pid: gen aux = _n
		keep if aux == 1
		drop aux
		
		rename eid eid_Ind
		
		* keep key variables *
		keep pid tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb d_t1_pc ///
		eid_Ind ///
		t1h_earn_t4 t1h_tot_inc_calc emp_inc tot_emp_inc
		
		* order key variables *
		order pid tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb d_t1_pc ///
		eid_Ind ///
		t1h_earn_t4 t1h_tot_inc_calc emp_inc tot_emp_inc
	
		* save intermediate dataset to the data_folder *
		compress
		save "`datadir'gc_Individuals_`y'_`ext'_clean.dta", replace
		clear
	}
	
	disp "***** STEP 4: clean individuals data (COMPLETED) *****"
}
disp "***** Finished processing STEP 4 *****"
endtime

*****************************************************************************
* STEP 5: merge business owners to individuals data and clean cross-section *
*****************************************************************************
disp "***** Started processing STEP 5 *****"
starttime
if 1 == `gate5' ///
{
	disp "***** STEP 5: merge business owners to individuals data *****"
	
	forvalues y = `start_year'(1)`end_year' ///
	{
		use "`datadir'gc_Individuals_`y'_`ext'_clean.dta", clear
		
		merge 1:1 pid using "`datadir'gc_SE_all_`y'_`ext'_clean.dta"

		**************************************************************
		* Construct the relevant income measures for each occupation *
		* y_0 cutoff is defined as 26 weeks X 40 hours X 10$ / hour	 *
		**************************************************************
		gen y_0 = 10400
		gen y_1 = 0
		gen y_2 = 0
		gen y_3_all = 0
		gen y_3_startups = 0
		
		******************************************************************
		* convert y_0 cutoff into constant 2012 dollars                  *
		* CPI is from Bank of Canada's core index: CANSIM Table 326-0021 *
		******************************************************************
		foreach var in ///
		"y_0" ///
		{
			replace `var' = `var'/1.223132037 if tax_yr == 2001
			replace `var' = `var'/1.195 if tax_yr == 2002
			replace `var' = `var'/1.16927593 if tax_yr == 2003
			replace `var' = `var'/1.151252408 if tax_yr == 2004
			replace `var' = `var'/1.132701422 if tax_yr == 2005
			replace `var' = `var'/1.111627907 if tax_yr == 2006
			replace `var' = `var'/1.088342441 if tax_yr == 2007
			replace `var' = `var'/1.069829902 if tax_yr == 2008
			replace `var' = `var'/1.05193662 if tax_yr == 2009
			replace `var' = `var'/1.033737024 if tax_yr == 2010
			replace `var' = `var'/1.017021277 if tax_yr == 2011
			replace `var' = `var'/1 if tax_yr == 2012
		}

		* clean all the variables necessary to construct the relevant income measures for each occupation *
		foreach var in "t1h_earn_t4" "t1h_tot_inc_calc" "emp_inc" "tot_emp_inc" "t1h_bus_inc_gross" "t1h_bus_inc_net" "tot_busi_t4" "tot_busi_3680" "tot_busi_t4_startups" "tot_busi_3680_startups"  ///
		{
			*gen `var'_og = `var'
			replace `var' = 0 if missing(`var')
		}
		
		* some individuals are in a t4 but don't have any info on their t1. I give them tot_emp_inc as employment earnings, which comes from the t4 *
		replace t1h_earn_t4 = tot_emp_inc if t1h_earn_t4 <= tot_emp_inc & tot_emp_inc != .
		
		************************************************************************
		* First, define the income measures for those who are in both datasets *
		************************************************************************
		* paid-employment income is total employment income minus employment income that comes from your own businesses *
		replace y_1 = max(t1h_earn_t4-tot_busi_t4,0) if _merge == 3
		* unincorporated self-employment income is net_se_inc *
		replace y_2 = t1h_bus_inc_net if _merge == 3
		* entrepreneurship income is employment income that comes from your own businesses *
		replace y_3_all = tot_busi_t4 if _merge == 3
		replace y_3_startups = tot_busi_t4_startups if _merge == 3
		
		********************************************************************************************
		* Second, define the income measures for those who are only in the business owners dataset *
		********************************************************************************************
		* paid-employment income is zero *
		replace y_1 = 0 if _merge == 2
		* self-employment income is net_se_inc *
		replace y_2 = t1h_bus_inc_net if _merge == 2
		* entrepreneurship income is employment income that comes from your own businesses *
		replace y_3_all = tot_busi_t4 if _merge == 2
		replace y_3_startups = tot_busi_t4_startups if _merge == 2
		
		***************************************************************************************
		* Third, define the income measures for those who are only in the individuals dataset *
		***************************************************************************************
		* paid-employment income is total employment income *
		replace y_1 = t1h_earn_t4 if _merge == 1
		* self-employment income is zero *
		replace y_2 = 0 if _merge == 1
		* entrepreneurship income is zero *
		replace y_3_all = 0 if _merge == 1
		replace y_3_startups = 0 if _merge == 1
		
		label var y_0 "Nonemployment income, assumed to be equal to 10400 in 2012"
		label var y_1 "Employment income"
		label var y_2 "Unincorporated income"
		label var y_3_all "Incorporated income, all firms"
		label var y_3_startups "Incorporated income, startups only"
		
		***************************************
		* Construct the relevant postal codes *
		***************************************
		gen home_pc = ""
		replace home_pc = d_t1_pc if d_t1_pc != "" & d_t1_pc != "NV"
		
		* keep key variables *
		keep pid eid* tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb home_pc ///
		t1h_earn_t4* t1h_tot_inc_calc* emp_inc* tot_emp_inc* t1h_bus_inc_gross* t1h_bus_inc_net* tot_busi_t4* tot_busi_3680* ///
		y*
		
		* order key variables *
		order pid eid* tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb home_pc ///
		t1h_earn_t4* t1h_tot_inc_calc* emp_inc* tot_emp_inc* t1h_bus_inc_gross* t1h_bus_inc_net* tot_busi_t4* tot_busi_3680* ///
		y*

		* save intermediate dataset to the data_folder *
		sort pid
		isid pid
		compress
		save "`datadir'gc_cross_section_`y'_`ext'.dta", replace
		clear

		* delete large intermediate datasets as I go *
		capture erase "`datadir'gc_SE_all_`y'_`ext'_clean.dta"
		capture erase "`datadir'gc_Individuals_`y'_`ext'_clean.dta"
	}
	
	disp "***** STEP 5: merge business owners to individuals data (COMPLETED) *****"
}
disp "***** Finished processing STEP 5 *****"
endtime

*****************************************
* STEP 6: merge cities to cross-section *
*****************************************
disp "***** Started processing STEP 6 *****"
starttime
if 1 == `gate6' ///
{
	disp "***** STEP 6: merge cities to cross-section *****"
	forvalues y = `start_year'(1)`end_year' ///
	{
		use "`datadir'gc_cross_section_`y'_`ext'.dta", clear
		
		*************************************************************
		* First, I merge city information based on home postal code *
		*************************************************************
		rename home_pc pc
		merge m:1 pc using "`datadir'gc_pc_to_cma_`ext'.dta"
		drop if _merge == 2
		drop _merge
		foreach var in "pc" "prov" "cma_name" "cma_code" "city_size_3c"  "city_size_4c"  "city_size_5c"  "city_size_6c" ///
		{
			rename `var' res_`var'
		}
		
		* Individuals that don't belong to any CMA are assigned the "Rural" category *
		replace res_cma_name = "Rural" if res_cma_name == ""
		replace res_cma_code = "99999" if res_cma_code == ""

		* calculate average employment income in each postal code *
		bys res_pc: egen res_pc_inc = mean(t1h_earn_t4)
		
		* calculate number of people and share of business owners in each postal code *
		gen count_all = 1
		gen count_busi_t4 = 0
		replace count_busi_t4 = 1 if tot_busi_t4>0
		bys res_pc: egen res_pc_pop = total(count_all)
		bys res_pc: egen res_pc_busipop = total(count_busi_t4)
		gen res_pc_shr_bus = res_pc_busipop/res_pc_pop
		drop count*

		* keep key variables *
		keep pid eid* tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb res* ///
		t1h_earn_t4* t1h_tot_inc_calc* emp_inc* tot_emp_inc* t1h_bus_inc_gross* t1h_bus_inc_net* tot_busi_t4* tot_busi_3680* ///
		y*
		
		* order key variables *
		order pid eid* tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb res* ///
		t1h_earn_t4* t1h_tot_inc_calc* emp_inc* tot_emp_inc* t1h_bus_inc_gross* t1h_bus_inc_net* tot_busi_t4* tot_busi_3680* ///
		y*

		* save intermediate dataset to the data_folder *
		sort pid
		compress
		save "`datadir'gc_cross_section_`y'_`ext'_w_cities.dta", replace
		clear
		
		* delete large intermediate datasets as I go *
		capture erase "`datadir'gc_cross_section_`y'_`ext'.dta"
	}
	disp "***** STEP 6: merge cities to cross-section (COMPLETED) *****"
}
disp "***** Finished processing STEP 6 *****"
endtime

********************************************
* STEP 7: delete all intermediate datasets *
********************************************
disp "***** Started processing STEP 7 *****"
starttime
if 1 == `gate7' ///
{
	disp "***** STEP 7: delete all intermediate datasets *****"
	
	capture erase "`datadir'gc_Enterprise_postcodes_clean.dta"
	
	disp "***** STEP 7: delete all intermediate datasets (COMPLETED) *****"
}
disp "***** Finished processing STEP 7 *****"
endtime

*********************
* CLEAN UP AND EXIT *
*********************
clear all
log close
